Load the required library packages:
library("dplyr") # Data wrangling
library("tidyverse")
library("lubridate")
library("ggplot2")
library("ggthemes")
library("gridExtra")
library("quantmod")
library("xts")
library("zoo")
library("forecast")
library("fpp")
library("fpp2")
library("tseries")
library("TSstudio")
library('IRdisplay')
library("htmlwidgets")
library("RQuantLib")
library("fpp2")
We collected the past ten years of data from PJM Interconnection LLC, a regional transmission organization that serves thirteen states for electric transmission.
# Load ten years (2012-2021) of PJM hourly load files and stack them into one
# data frame. The files share the naming pattern hrl_load_estimated_<year>.csv.
# NOTE(review): setwd() with an absolute user-specific path makes the script
# non-portable; prefer relative paths or a project-root helper — TODO confirm.
setwd('/Users/meizhuchen/Desktop/Columbia/Columbia_S2/APAN_5205/Project/')
# Fix: replaced ten copy-pasted read.csv() calls with one vectorized read so
# adding another year means changing a single range, not a new line.
data <- do.call(
  rbind,
  lapply(2012:2021, function(yr) {
    read.csv(sprintf("hrl_load_estimated_%d.csv", yr))
  })
)
There are 87,673 rows and 6 variables in our dataset, including datetime_beginning_utc, datetime_beginning_ept, datetime_ending_utc, datetime_ending_ept, load_area, and estimated_load_hourly.
# Inspect the combined data: 87,673 rows x 6 columns; note all datetime
# columns were read as character strings (<chr>) and must be parsed later.
glimpse(data)
Rows: 87,673
Columns: 6
$ datetime_beginning_utc <chr> "2/24/2012 5:00:00 AM", "2/24/2012 6:…
$ datetime_beginning_ept <chr> "2/24/2012 12:00:00 AM", "2/24/2012 1…
$ datetime_ending_utc <chr> "2/24/2012 6:00:00 AM", "2/24/2012 7:…
$ datetime_ending_ept <chr> "2/24/2012 1:00:00 AM", "2/24/2012 2:…
$ load_area <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "A…
$ estimated_load_hourly <int> 13640, 13245, 13065, 13063, 13255, 14…
Since we only need the beginning datetime of the hourly energy load, we do not need the ending datetime variables, so we deleted datetime_ending_utc and datetime_ending_ept. Besides, because our research focuses on data from the American Electric Power company, which covers the Eastern part of the U.S., we only need the EPT datetime, so we also deleted the datetime_beginning_utc variable.
# Keep only the beginning EPT timestamp, the load area, and the hourly load;
# the UTC and ending-time columns are not needed for this analysis.
cols_to_drop <- c("datetime_beginning_utc",
                  "datetime_ending_utc",
                  "datetime_ending_ept")
df <- data[, !(names(data) %in% cols_to_drop)]
glimpse(df)
Rows: 87,673
Columns: 3
$ datetime_beginning_ept <chr> "2/24/2012 12:00:00 AM", "2/24/2012 1…
$ load_area <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "A…
$ estimated_load_hourly <int> 13640, 13245, 13065, 13063, 13255, 14…
We changed the datatype of the datetime_beginning_ept variable from string to datetime and renamed it to dt (and renamed estimated_load_hourly to energy_use).
# Parse the EPT timestamp ("M/D/YYYY H:MM:SS AM/PM") into a POSIXct datetime,
# then rename the columns to shorter working names. Renaming by column name
# (rather than by position) is robust to column reordering.
df$datetime_beginning_ept <- mdy_hms(df$datetime_beginning_ept)
names(df)[names(df) == "datetime_beginning_ept"] <- "dt"
names(df)[names(df) == "estimated_load_hourly"] <- "energy_use"
glimpse(df)
Rows: 87,673
Columns: 3
$ dt <dttm> 2012-02-24 00:00:00, 2012-02-24 01:00:00, 2012-0…
$ load_area <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", …
$ energy_use <int> 13640, 13245, 13065, 13063, 13255, 14042, 15533, …
There are no missing values in the dataset.
Quick visualize the data to see if there is any unusual situation in the dataset.
We found out that there was unusual energy usage at the end of 2012, so we decided to remove the data from 2012. Besides, we also noticed that the data for 2022 is incomplete, so we deleted the data from 2022 as well. After removal, there are 78,888 rows remaining in the dataset.
Rows: 78,888
Columns: 3
$ dt <dttm> 2013-01-01 00:00:00, 2013-01-01 01:00:00, 2013-0…
$ load_area <chr> "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", "AEP", …
$ energy_use <int> 14327, 14190, 13785, 13730, 13820, 13938, 14276, …
The data looks better than before. In the end, we keep the data from 2013 to 2021 and 3 variables: dt (DateTime), load_area, and energy_use (estimated hourly load).
# Export the cleaned dataset.
# Fix: the original write.table() used append = TRUE with col.names = FALSE,
# so every re-run appended duplicate, headerless rows to the CSV. write.csv()
# overwrites the file with a proper header each run. Also removed the doubled
# "//" in the path.
write.csv(df,
          file = "/Users/meizhuchen/Desktop/Columbia/Columbia_S2/APAN_5205/Project/new data.csv",
          row.names = FALSE,
          na = "NA")
Rows: 3,287
Columns: 2
$ dt <date> 2013-01-01, 2013-01-02, 2013-01-03, 2013-01-04, …
$ energy_use <int> 363696, 422097, 434357, 428915, 390983, 366413, 4…
# Aggregate the daily series (df1) to monthly totals: collapse each date to a
# "YYYY-MM" key, then sum energy_use within each month.
df2 <- transform(df1, dt = format(as.Date(dt), "%Y-%m"))
df2 <- aggregate(energy_use ~ dt, data = df2, FUN = sum)
glimpse(df2)
Rows: 108
Columns: 2
$ dt <chr> "2013-01", "2013-02", "2013-03", "2013-04", "2013…
$ energy_use <int> 12468894, 11386614, 11864582, 9933289, 10362414, …
# Build the daily time series. The start is anchored at the first observed
# date, expressed as (year, day-of-year) for a frequency-365 ts object.
first_day <- min(df1$dt)
ts1 <- ts(
  data = df1$energy_use,
  start = c(lubridate::year(first_day), lubridate::yday(first_day)),
  frequency = 365
)
ts_info(ts1)
The ts1 series is a ts object with 1 variable and 3287 observations
Frequency: 365
Start time: 2013 1
End time: 2022 2
#By month
df2<-subset(df2, select = c(-dt))
ts2<-ts(df2,start = c(2013,01), end = c(2021,12),frequency = 12)
class(ts2)
[1] "ts"
# Summarize the monthly series (length, frequency, start/end).
ts_info(ts2)
The ts2 series is a ts object with 1 variable and 108 observations
Frequency: 12
Start time: 2013 1
End time: 2021 12
# Exploratory bar charts of total energy use by hour, weekday, and holiday.
# NOTE(review): `hour`, `weekdays`, and `holiday` are not among the columns
# shown by the glimpse() calls above (dt, load_area, energy_use) — presumably
# these feature columns were derived in code not shown here; confirm.
ggplot(df, aes(hour, energy_use)) +
geom_bar(stat='identity', aes(fill = hour)) +
xlab("hour") +
ylab("energy")
# Total energy use by day of week.
ggplot(df1, aes(weekdays, energy_use)) +
geom_bar(stat='identity', aes(fill = weekdays)) +
xlab("weekdays") +
ylab("energy")
# Total energy use on holidays vs non-holidays.
ggplot(df1, aes(holiday, energy_use)) +
geom_bar(stat='identity', aes(fill = holiday)) +
xlab("holiday or not") +
ylab("energy")
# Time-series exploration plots for the daily (ts1) and monthly (ts2) series.
# Fix: ts1 holds DAILY totals (frequency = 365, built from the daily df1), so
# the original plot title "Hourly Energy Consumption - AEP" was misleading.
ts_plot(ts1, title = "Daily Energy Consumption - AEP")
ts_decompose(ts1)
ts_heatmap(ts1)
# NOTE(review): ts_quantile() is given the raw data frame df1 here; assumes
# df1 matches the column layout TSstudio expects — confirm.
ts_quantile(df1, period = "monthly")
# Seasonal plots of the monthly series, standard and polar coordinates.
ggseasonplot(ts2)
ggseasonplot(ts2, polar = TRUE)
Since the test data contains 24 months, we will be constructing forecasts for 24 periods.
# Compare the four benchmark forecasts on the hold-out period.
# Row 2 of accuracy() holds the test-set error metrics; the list names become
# the row names of the resulting comparison matrix.
benchmark_fits <- list(
  average_model = average_model,
  naive_model = naive_model,
  seasonal_naive_model = seasonal_naive_model,
  drift_model = drift_model
)
do.call(rbind, lapply(benchmark_fits, function(fit) accuracy(f = fit, x = ts2)[2, ]))
ME RMSE MAE MPE MAPE
average_model -422351.3 1067452.5 913989.7 -4.959097 9.133660
naive_model -600494.1 1149638.0 973370.6 -6.676168 9.828396
seasonal_naive_model -276488.2 573660.7 475508.8 -2.850343 4.622409
drift_model -389431.9 1072761.5 906427.9 -4.655948 9.053743
MASE ACF1 Theil's U
average_model 1.8688875 0.4432732 1.0995476
naive_model 1.9903071 0.4432732 1.1945507
seasonal_naive_model 0.9723002 -0.3452301 0.5355287
drift_model 1.8534256 0.4688047 1.1081050
# Plot the training data with each benchmark forecast overlaid, then the
# actual test data on top. PI = FALSE suppresses prediction intervals.
autoplot(train) +
  autolayer(average_model, series = 'Average Model', PI = FALSE, size = 1.1) +
  autolayer(naive_model, series = 'Naive Model', PI = FALSE, size = 1.1) +
  autolayer(seasonal_naive_model, series = 'Seasonal Naive Model', PI = FALSE, size = 1.1) +
  autolayer(drift_model, series = 'Drift Model', PI = FALSE, size = 1.1) +
  autolayer(test)
Forecasts are weighted averages of past observations with the weights decaying exponentially such that recent observations get weighted more than distant observations.
# Simple exponential smoothing: forecasts are weighted averages with the most
# recent observations weighted most heavily.
ses_model <- ses(train, h = 24)
# Holt's method: extends SES to allow forecasting data with a linear trend.
holt_model <- holt(train, h = 24)
# Holt with damping: the trend flattens toward a constant rather than growing
# indefinitely into the future.
holt_damped_model <- holt(train, h = 24, damped = TRUE)
# Test-set accuracy comparison for the three exponential-smoothing models
# (row 2 of accuracy() = hold-out metrics).
es_fits <- list(
  ses_model = ses_model,
  holt_model = holt_model,
  holt_damped_model = holt_damped_model
)
do.call(rbind, lapply(es_fits, function(fit) accuracy(f = fit, x = ts2)[2, ]))
ME RMSE MAE MPE MAPE
ses_model -346500.0 1039777 890174.6 -4.227986 8.851409
holt_model -395029.8 1070486 904443.3 -4.707237 9.036634
holt_damped_model -205592.2 1002228 865494.4 -2.870435 8.509174
MASE ACF1 Theil's U
ses_model 1.820191 0.4432732 1.065192
holt_model 1.849367 0.4632745 1.105293
holt_damped_model 1.769726 0.4442958 1.013739
# Plot the exponential-smoothing forecasts against the training data.
# Fix: the original chain ended after holt_damped_model, leaving
# `autolayer(test)` as a separate bare expression — the test series was never
# added to the plot (the orphan layer was merely printed). Joined with `+`.
autoplot(train) +
  autolayer(ses_model, PI = FALSE, size = 1.1, series = 'ses_model') +
  autolayer(holt_model, PI = FALSE, size = 1.1, series = 'holt_model') +
  autolayer(holt_damped_model, PI = FALSE, size = 1.1, series = 'holt_damped_model') +
  autolayer(test)
mapping: group = ~series, colour = ~series, x = ~timeVal, y = ~seriesVal
geom_line: na.rm = FALSE, orientation = NA
stat_identity: na.rm = FALSE
position_identity
ETS models in R are handled by ets() from library(forecast). Unlike functions such as naive(), ses(), hw() functions, the ets() function does not produce forecasts. Rather, it estimates the model parameters and returns information on the fitted model.
# With only the series supplied (all other arguments at defaults), ets()
# automatically selects the best ETS specification by AICc.
ets_auto <- ets(train)
summary(ets_auto)
ETS(M,N,M)
Call:
ets(y = train)
Smoothing parameters:
alpha = 0.2632
gamma = 3e-04
Initial states:
l = 11085975.5404
s = 1.0534 0.9544 0.9075 0.9479 1.0676 1.0758
0.9915 0.9199 0.8809 1.0238 1.0155 1.1619
sigma: 0.0422
AIC AICc BIC
2576.918 2583.976 2613.380
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set -14904.15 430291.9 340718.1 -0.2595381 3.110457 0.696686
ACF1
Training set 0.1788073
When only the time-series is specified, and all other arguments are left at their default values, then ets() will automatically select the best model based on AICc.
# Residual diagnostics (time plot, ACF, and Ljung-Box test) for the
# auto-selected ETS model.
checkresiduals(ets_auto)
Ljung-Box test
data: Residuals from ETS(M,N,M)
Q* = 25.35, df = 3, p-value = 1.305e-05
Model df: 14. Total lags used: 17
ME RMSE MAE MPE MAPE
Training set -14904.15 430291.9 340718.1 -0.2595381 3.110457
Test set -306414.16 542260.1 437906.4 -3.1575038 4.275699
MASE ACF1 Theil's U
Training set 0.6966860 0.1788073 NA
Test set 0.8954126 0.1208285 0.5027521
ARIMA models are among the most widely used approaches to time-series forecasting and provide a complementary approach to the problem: they aim to describe the autocorrelations in the data.
# Augmented Dickey-Fuller test with k = 0 lagged differences (a plain
# Dickey-Fuller test). Null hypothesis: unit root (non-stationary);
# alternative: stationary.
adf.test(ts2,k = 0)
Augmented Dickey-Fuller Test
data: ts2
Dickey-Fuller = -6.9447, Lag order = 0, p-value = 0.01
alternative hypothesis: stationary
Use auto.arima to pick the best model based on AICc. Because it uses computational shortcuts, it sometimes does not yield an optimal solution; setting stepwise and approximation to FALSE ensures a more extensive search.
# Exhaustive auto.arima search (stepwise/approximation disabled) with one
# ordinary and one seasonal difference imposed.
model_auto <- auto.arima(
  y = train,
  d = 1,
  D = 1,
  stepwise = FALSE,
  approximation = FALSE
)
model_auto
Series: train
ARIMA(1,1,1)(0,1,1)[12]
Coefficients:
ar1 ma1 sma1
0.3659 -0.9522 -0.7198
s.e. 0.1358 0.0931 0.1870
sigma^2 = 2.495e+11: log likelihood = -1036.44
AIC=2080.88 AICc=2081.49 BIC=2089.93
# Refit the selected ARIMA(1,1,1)(0,1,1)[12] with a Box-Cox transformation;
# the lambda is estimated from the training data.
bc_lambda <- BoxCox.lambda(train)
model1 <- Arima(y = train, order = c(1, 1, 1), seasonal = c(0, 1, 1), lambda = bc_lambda)
# Residual diagnostics: time plot, ACF, and PACF.
ggtsdisplay(residuals(model1))
ME RMSE MAE MPE MAPE
Training set -14698.14 448186.7 333727.5 -0.2259307 3.062610
Test set -245988.78 515513.3 432957.8 -2.5971909 4.229228
MASE ACF1 Theil's U
Training set 0.6823920 0.005927225 NA
Test set 0.8852938 -0.034303574 0.4908497
# Final test-set accuracy comparison across every model fitted above
# (benchmarks, exponential smoothing, ETS, and ARIMA forecasts).
all_forecasts <- list(
  average_model = average_model,
  naive_model = naive_model,
  seasonal_naive_model = seasonal_naive_model,
  drift_model = drift_model,
  ses_model = ses_model,
  holt_model = holt_model,
  holt_damped_model = holt_damped_model,
  ets_auto = ets_auto_forecast,
  arima = model1_forecast
)
do.call(rbind, lapply(all_forecasts, function(fc) accuracy(fc, x = ts2)[2, ]))
ME RMSE MAE MPE MAPE
average_model -422351.3 1067452.5 913989.7 -4.959097 9.133660
naive_model -600494.1 1149638.0 973370.6 -6.676168 9.828396
seasonal_naive_model -276488.2 573660.7 475508.8 -2.850343 4.622409
drift_model -389431.9 1072761.5 906427.9 -4.655948 9.053743
ses_model -346500.0 1039777.2 890174.6 -4.227986 8.851409
holt_model -395029.8 1070485.6 904443.3 -4.707237 9.036634
holt_damped_model -205592.2 1002228.2 865494.4 -2.870435 8.509174
ets_auto -306414.2 542260.1 437906.4 -3.157504 4.275699
arima -245988.8 515513.3 432957.8 -2.597191 4.229228
MASE ACF1 Theil's U
average_model 1.8688875 0.44327322 1.0995476
naive_model 1.9903071 0.44327322 1.1945507
seasonal_naive_model 0.9723002 -0.34523015 0.5355287
drift_model 1.8534256 0.46880473 1.1081050
ses_model 1.8201914 0.44327322 1.0651918
holt_model 1.8493674 0.46327447 1.1052930
holt_damped_model 1.7697264 0.44429585 1.0137387
ets_auto 0.8954126 0.12082848 0.5027521
arima 0.8852938 -0.03430357 0.4908497
# Overlay every model's forecast on the train/test series.
# Fix: the drift and SES layers were both mislabeled 'Seasonal Naive Model'
# (copy-paste error), which made the legend wrong and merged three distinct
# forecasts under one legend entry.
autoplot(train, color = 'sienna') +
  autolayer(test, size = 1.05, color = 'seagreen2') +
  autolayer(average_model, series = 'Average Model', PI = FALSE) +
  autolayer(naive_model, series = 'Naive Model', PI = FALSE) +
  autolayer(seasonal_naive_model, series = 'Seasonal Naive Model', PI = FALSE) +
  autolayer(drift_model, series = 'Drift Model', PI = FALSE) +
  autolayer(ses_model, series = 'SES Model', PI = FALSE) +
  autolayer(holt_model, series = 'Holt', PI = FALSE) +
  autolayer(ets_auto_forecast, series = 'ETS Auto', PI = FALSE) +
  autolayer(model1_forecast, series = 'ARIMA', PI = FALSE)
# Refit the chosen ARIMA specification on the FULL monthly series (not just
# the training split) and produce the 12-month-ahead 2022 forecast.
model2 <- Arima(
  y = ts2,
  order = c(1, 1, 1),
  seasonal = c(0, 1, 1),
  lambda = BoxCox.lambda(ts2)
)
forecast(model2, h = 12)
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
Jan 2022 11901306 11294253 12486455 10963026 12788203
Feb 2022 10577829 9870137 11252377 9480051 11597697
Mar 2022 10493936 9775360 11178104 9378866 11528103
Apr 2022 9035803 8225966 9795567 7772724 10180713
May 2022 9549752 8774258 10281569 8342722 10653869
Jun 2022 10357495 9629766 11049510 9227752 11403243
Jul 2022 11532923 10863426 12175056 10496495 12505105
Aug 2022 11362184 10684911 12011062 10313348 12344340
Sep 2022 9917097 9164110 10630296 8746580 10993952
Oct 2022 9421228 8637142 10160112 8200229 10535684
Nov 2022 9970138 9220106 10680886 8804408 11043402
Dec 2022 11044554 10351747 11706856 9970899 12046546